# download_live_crl.py
# CRL (College and Research Libraries) Downloader
# Automates downloading PDFs from CRL (OJS-based journal)
# - Parses issue page
# - Filters only article sections
# - Applies dynamic filename sanitization
# - Downloads PDFs with clean titles

import requests
from bs4 import BeautifulSoup
import re
import os
from urllib.parse import urljoin

def sanitize(title):
    """Return *title* cleaned up for use as a file name (without extension).

    Characters that are not allowed in Windows file names (backslash, slash,
    colon, asterisk, question mark, double quote, angle brackets, pipe) and
    ASCII control characters are removed. Runs of whitespace, including
    newlines and tabs picked up from wrapped HTML text, collapse to a single
    space, and leading/trailing whitespace is dropped.

    Returns ``"untitled"`` when nothing usable remains, so callers never end
    up with an empty file name.
    """
    # Turn every whitespace run (newlines, tabs, ...) into one plain space
    # first, so removing control characters below cannot glue words together.
    collapsed = ' '.join(title.split())
    cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f\x7f]', '', collapsed)
    # Removing a character can leave doubled or trailing spaces ("a : b",
    # "What now ?"); normalise once more.
    cleaned = ' '.join(cleaned.split())
    return cleaned or 'untitled'

# Ask the user for the issue's table-of-contents page.
issue_toc_url = input("Enter the CRL issue URL: ").strip()

# Browser-like User-Agent sent with every request made by this script.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36"
}

# Fetch the issue page. A timeout keeps a stalled connection from hanging the
# script forever, and raise_for_status() stops an HTTP error page from being
# parsed as if it were the issue (which would only surface later as a
# misleading "No Articles section found"). An empty or malformed URL is
# reported here too, since requests raises a RequestException subclass for it.
try:
    resp = requests.get(issue_toc_url, headers=headers, timeout=30)
    resp.raise_for_status()
except requests.RequestException as err:
    print(f"❌ Could not fetch issue page: {err}")
    raise SystemExit(1)

soup = BeautifulSoup(resp.text, 'html.parser')

# Host against which relative galley/download links are resolved.
base_url = "https://crl.acrl.org"
os.makedirs("CRL_PDFs_Live", exist_ok=True)

# Find Articles section: the first table following a section heading whose
# text contains "Articles".
articles_section = None
for header in soup.find_all(['h3', 'h4'], class_='tocSectionTitle'):
    if "Articles" in header.get_text():
        articles_section = header.find_next('table')
        break

if articles_section is None:
    print("❌ No Articles section found.")
    # SystemExit works without the site-provided exit() helper, and the
    # non-zero status tells calling shells/scripts that the run failed.
    raise SystemExit(1)

# Seconds to wait on each HTTP request; without a timeout a stalled
# connection would block the run indefinitely.
REQUEST_TIMEOUT = 30


def _fetch(url):
    """GET *url* with the shared headers and timeout, raising on HTTP errors."""
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def _download_pdf(galley_url):
    """Return the bytes behind a galley link, or None if no PDF link is found.

    The galley URL may serve the PDF directly. Otherwise its HTML is searched
    for a link containing ``/download/`` and, failing that, for an embedded
    viewer element (iframe/object/embed) whose source ends in ``.pdf``.

    Raises ``requests.RequestException`` on network or HTTP errors.
    """
    galley = _fetch(galley_url)

    # ✅ If the galley URL itself returns a PDF
    if 'application/pdf' in galley.headers.get('Content-Type', '').lower():
        return galley.content

    # Parse HTML for /download/ link
    galley_soup = BeautifulSoup(galley.text, 'html.parser')
    download_link = galley_soup.find('a', href=lambda h: h and '/download/' in h)
    if download_link:
        pdf_url = urljoin(base_url, download_link['href'])
        print(f"   Detected download URL: {pdf_url}")
        return _fetch(pdf_url).content

    # Fallback to iframe/object/embed
    for tag in galley_soup.find_all(['iframe', 'object', 'embed']):
        # <object> carries its resource URL in "data"; <iframe> and <embed>
        # use "src".
        source = tag.get('src') or tag.get('data') or ''
        # Ignore any query string or fragment when testing the extension.
        path = source.split('#', 1)[0].split('?', 1)[0]
        if path.lower().endswith('.pdf'):
            pdf_url = urljoin(base_url, source)
            print(f"   Detected embedded PDF: {pdf_url}")
            return _fetch(pdf_url).content

    return None


# Lower-cased file names written during this run, so two articles with the
# same title do not silently overwrite each other (also on case-insensitive
# file systems).
saved_names = set()

for tr in articles_section.select('tr'):
    title_tag = tr.select_one('div.tocTitle a')
    pdf_tag = tr.select_one('div.tocGalleys a.file')

    # Rows lacking a title link or a galley link are not downloadable articles.
    if title_tag is None or pdf_tag is None:
        continue

    href = pdf_tag.get('href')
    if not href:
        continue

    title = title_tag.get_text(strip=True)
    galley_url = urljoin(base_url, href)
    print(f"Fetching galley for: {title}")

    # One failing article must not abort the remaining downloads.
    try:
        pdf_data = _download_pdf(galley_url)
    except requests.RequestException as err:
        print(f"⚠️ Download failed for: {title} ({err})")
        continue

    if not pdf_data:
        print(f"⚠️ Could not find PDF for: {title}")
        continue

    # Guard against saving an HTML error/login page under a .pdf name: a PDF
    # carries its "%PDF-" signature at (or very near) the start of the file.
    if b'%PDF-' not in pdf_data[:1024]:
        print(f"⚠️ Response is not a PDF for: {title}")
        continue

    stem = sanitize(title) or "untitled"
    fname = stem + ".pdf"
    suffix = 2
    while fname.lower() in saved_names:
        fname = f"{stem} ({suffix}).pdf"
        suffix += 1
    saved_names.add(fname.lower())

    try:
        with open(os.path.join("CRL_PDFs_Live", fname), "wb") as f:
            f.write(pdf_data)
    except OSError as err:
        # e.g. a name too long for the file system; keep going with the rest.
        print(f"⚠️ Could not save {fname}: {err}")
        continue

    print(f"✅ Saved: {fname}")

print("\nAll done! PDFs are in CRL_PDFs_Live folder.")
